This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
day = read.csv("/Users/Murphy1/Desktop/DATA2020/Project2020/data/hour.csv", header = TRUE)
head(day)
library(dplyr)
#select variables season, workingday, weathersit, temp, atemp, hum, windspeed
day1 <- dplyr::select(day, -c(instant, dteday))
head(day1)
#drop atemp
day1 <- dplyr::select(day1, -c(atemp))
day1
# Check Line
fit.all = lm(cnt ~ season + yr + mnth + hr + holiday + workingday + weathersit + temp + hum + windspeed, data = day1)
intercept_only <- lm(cnt ~ 1, data=day1)
forward <- step(intercept_only, direction='forward',
scope=formula(fit.all))
Start: AIC=180764.7
cnt ~ 1
Df Sum of Sq RSS AIC
+ temp 1 93677759 478083832 177657
+ hr 1 88790198 482971393 177834
+ hum 1 59618351 512143240 178853
+ yr 1 35876722 535884870 179641
+ season 1 18127040 553634551 180207
+ weathersit 1 11598301 560163290 180411
+ mnth 1 8321115 563440476 180512
+ windspeed 1 4970060 566791531 180615
+ holiday 1 546889 571214702 180750
+ workingday 1 524387 571237204 180751
<none> 571761591 180765
Step: AIC=177657
cnt ~ temp
Df Sum of Sq RSS AIC
+ hr 1 66728221 411355610 175046
+ hum 1 49874585 428209247 175744
+ yr 1 31342263 446741569 176481
+ windspeed 1 6021342 472062489 177439
+ weathersit 1 5880680 472203151 177444
+ season 1 1696802 476387030 177597
+ mnth 1 906463 477177369 177626
+ holiday 1 225697 477858135 177651
<none> 478083832 177657
+ workingday 1 35467 478048365 177658
Step: AIC=175046.4
cnt ~ temp + hr
Df Sum of Sq RSS AIC
+ yr 1 32229070 379126540 173631
+ hum 1 25434149 385921461 173939
+ weathersit 1 5638993 405716617 174809
+ season 1 2995571 408360039 174921
+ windspeed 1 1712372 409643238 174976
+ mnth 1 1525503 409830107 174984
+ holiday 1 260174 411095436 175037
+ workingday 1 54016 411301595 175046
<none> 411355610 175046
Step: AIC=173630.5
cnt ~ temp + hr + yr
Df Sum of Sq RSS AIC
+ hum 1 20865128 358261412 172649
+ weathersit 1 5240157 373886383 173391
+ season 1 3515639 375610901 173471
+ mnth 1 1811568 377314972 173549
+ windspeed 1 1810495 377316045 173549
+ holiday 1 307712 378818828 173618
+ workingday 1 66617 379059923 173629
<none> 379126540 173631
Step: AIC=172648.7
cnt ~ temp + hr + yr + hum
Df Sum of Sq RSS AIC
+ season 1 7325038 350936374 172292
+ mnth 1 4840141 353421271 172414
+ holiday 1 367007 357894405 172633
+ weathersit 1 133898 358127514 172644
+ workingday 1 117604 358143808 172645
<none> 358261412 172649
+ windspeed 1 15517 358245895 172650
Step: AIC=172291.7
cnt ~ temp + hr + yr + hum + season
Df Sum of Sq RSS AIC
+ holiday 1 371167 350565206 172275
+ windspeed 1 165365 350771008 172286
+ workingday 1 131868 350804506 172287
<none> 350936374 172292
+ weathersit 1 37972 350898402 172292
+ mnth 1 799 350935575 172294
Step: AIC=172275.3
cnt ~ temp + hr + yr + hum + season + holiday
Df Sum of Sq RSS AIC
+ windspeed 1 165343 350399863 172269
+ workingday 1 47102 350518104 172275
+ weathersit 1 42022 350523184 172275
<none> 350565206 172275
+ mnth 1 0 350565206 172277
Step: AIC=172269.1
cnt ~ temp + hr + yr + hum + season + holiday + windspeed
Df Sum of Sq RSS AIC
+ weathersit 1 73738 350326125 172267
+ workingday 1 48197 350351667 172269
<none> 350399863 172269
+ mnth 1 1 350399863 172271
Step: AIC=172267.5
cnt ~ temp + hr + yr + hum + season + holiday + windspeed + weathersit
Df Sum of Sq RSS AIC
+ workingday 1 53900 350272225 172267
<none> 350326125 172267
+ mnth 1 1 350326124 172269
Step: AIC=172266.8
cnt ~ temp + hr + yr + hum + season + holiday + windspeed + weathersit +
workingday
Df Sum of Sq RSS AIC
<none> 350272225 172267
+ mnth 1 1.92 350272223 172269
summary(forward)
Call:
lm(formula = cnt ~ temp + hr + yr + hum + season + holiday +
windspeed + weathersit + workingday, data = day1)
Residuals:
Min 1Q Median 3Q Max
-366.86 -93.52 -27.97 60.40 644.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.4775 6.5712 -1.442 0.149246
temp 282.8600 6.0073 47.086 < 2e-16 ***
hr 7.6962 0.1649 46.680 < 2e-16 ***
yr 80.9848 2.1668 37.376 < 2e-16 ***
hum -197.1206 6.8663 -28.708 < 2e-16 ***
season 20.0853 1.0486 19.154 < 2e-16 ***
holiday -25.1349 6.6610 -3.773 0.000162 ***
windspeed 29.5888 9.4015 3.147 0.001651 **
weathersit -3.7797 1.9044 -1.985 0.047187 *
workingday 3.9204 2.3980 1.635 0.102096
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 142 on 17369 degrees of freedom
Multiple R-squared: 0.3874, Adjusted R-squared: 0.3871
F-statistic: 1220 on 9 and 17369 DF, p-value: < 2.2e-16
#forward
backward <- step(fit.all, direction='backward')
Start: AIC=172268.8
cnt ~ season + yr + mnth + hr + holiday + workingday + weathersit +
temp + hum + windspeed
Df Sum of Sq RSS AIC
- mnth 1 2 350272225 172267
<none> 350272223 172269
- workingday 1 53901 350326124 172269
- weathersit 1 79432 350351655 172271
- windspeed 1 199754 350471977 172277
- holiday 1 286716 350558939 172281
- season 1 2453586 352725809 172388
- hum 1 16562243 366834466 173070
- yr 1 28170183 378442406 173611
- hr 1 43896453 394168676 174319
- temp 1 44245313 394517536 174334
Step: AIC=172266.8
cnt ~ season + yr + hr + holiday + workingday + weathersit +
temp + hum + windspeed
Df Sum of Sq RSS AIC
<none> 350272225 172267
- workingday 1 53900 350326125 172267
- weathersit 1 79442 350351667 172269
- windspeed 1 199752 350471977 172275
- holiday 1 287145 350559370 172279
- season 1 7398820 357671045 172628
- hum 1 16620569 366892794 173070
- yr 1 28171966 378444191 173609
- hr 1 43943958 394216183 174319
- temp 1 44711469 394983694 174353
summary(backward)
Call:
lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = day1)
Residuals:
Min 1Q Median 3Q Max
-366.86 -93.52 -27.97 60.40 644.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.4775 6.5712 -1.442 0.149246
season 20.0853 1.0486 19.154 < 2e-16 ***
yr 80.9848 2.1668 37.376 < 2e-16 ***
hr 7.6962 0.1649 46.680 < 2e-16 ***
holiday -25.1349 6.6610 -3.773 0.000162 ***
workingday 3.9204 2.3980 1.635 0.102096
weathersit -3.7797 1.9044 -1.985 0.047187 *
temp 282.8600 6.0073 47.086 < 2e-16 ***
hum -197.1206 6.8663 -28.708 < 2e-16 ***
windspeed 29.5888 9.4015 3.147 0.001651 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 142 on 17369 degrees of freedom
Multiple R-squared: 0.3874, Adjusted R-squared: 0.3871
F-statistic: 1220 on 9 and 17369 DF, p-value: < 2.2e-16
#day1$cnt <- log(day1$cnt)
mod_forward <- lm(formula = cnt ~ temp + hr + yr + hum + season + holiday +
windspeed + weathersit + workingday, data = day1)
plot(mod_forward, which = c(1,2,3,4))
mod_backward <- lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = day1)
plot(mod_backward, which = c(1,2,3,4))
day1$cnt <- log(day1$cnt)
mod_backward <- lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = day1)
plot(mod_backward, which = c(1,2,3,4))
# Check the variance
library(car)
ncvTest(fit.all)
Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 2063.008, Df = 1, p = < 2.22e-16
backward <- step(fit.all, direction='backward')
Start: AIC=172268.8
cnt ~ season + yr + mnth + hr + holiday + workingday + weathersit +
temp + hum + windspeed
Df Sum of Sq RSS AIC
- mnth 1 2 350272225 172267
<none> 350272223 172269
- workingday 1 53901 350326124 172269
- weathersit 1 79432 350351655 172271
- windspeed 1 199754 350471977 172277
- holiday 1 286716 350558939 172281
- season 1 2453586 352725809 172388
- hum 1 16562243 366834466 173070
- yr 1 28170183 378442406 173611
- hr 1 43896453 394168676 174319
- temp 1 44245313 394517536 174334
Step: AIC=2483.88
cnt ~ season + yr + hr + holiday + workingday + weathersit +
temp + hum + windspeed
Df Sum of Sq RSS AIC
- weathersit 1 0.0 20026 2481.9
<none> 20026 2483.9
- workingday 1 12.4 20038 2492.6
- windspeed 1 13.0 20039 2493.2
- holiday 1 21.5 20048 2500.6
- season 1 477.2 20503 2891.2
- yr 1 773.6 20800 3140.6
- hum 1 881.3 20907 3230.3
- temp 1 2336.1 22362 4399.4
- hr 1 7693.2 27719 8131.6
Step: AIC=2481.9
cnt ~ season + yr + hr + holiday + workingday + temp + hum +
windspeed
Df Sum of Sq RSS AIC
<none> 20026 2481.9
- workingday 1 12.5 20039 2490.7
- windspeed 1 13.2 20039 2491.4
- holiday 1 21.5 20048 2498.6
- season 1 478.6 20505 2890.3
- yr 1 774.0 20800 3139.0
- hum 1 1114.1 21140 3420.8
- temp 1 2349.3 22376 4407.7
- hr 1 7788.8 27815 8189.5
summary(backward)
Call:
lm(formula = cnt ~ season + yr + hr + holiday + workingday +
temp + hum + windspeed, data = day1)
Residuals:
Min 1Q Median 3Q Max
-3.9871 -0.6398 0.1151 0.7121 3.1887
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.631172 0.049540 53.112 < 2e-16 ***
season 0.161355 0.007920 20.374 < 2e-16 ***
yr 0.424309 0.016376 25.911 < 2e-16 ***
hr 0.101812 0.001239 82.193 < 2e-16 ***
holiday -0.217687 0.050364 -4.322 1.55e-05 ***
workingday -0.059567 0.018112 -3.289 0.001008 **
temp 2.045017 0.045303 45.141 < 2e-16 ***
hum -1.438355 0.046271 -31.086 < 2e-16 ***
windspeed 0.237638 0.070215 3.384 0.000715 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.074 on 17370 degrees of freedom
Multiple R-squared: 0.4782, Adjusted R-squared: 0.478
F-statistic: 1990 on 8 and 17370 DF, p-value: < 2.2e-16
library(plyr)
library(readr)
library(dplyr)
library(glmnet)
library(tibble)
library(caret)
set.seed(42)
train_index <- createDataPartition(day1$cnt, p = 0.8, list = FALSE)
train_set <- day1[train_index,]
test_set <- day1[-train_index,]
# Train the multiple linear regression model
model <- lm(formula = cnt ~ season + yr + hr + holiday + workingday +
weathersit + temp + hum + windspeed, data = train_set)
# Make predictions on the test set
predictions <- predict(model, newdata = test_set)
# Evaluate the model
mae <- mean(abs(test_set$cnt - predictions))
mse <- mean((test_set$cnt - predictions)^2)
r_squared <- 1 - (sum((test_set$cnt - predictions)^2) / sum((test_set$cnt - mean(test_set$cnt))^2))
cat('Mean Absolute Error:', mae, '\n')
Mean Absolute Error: 0.8430607
cat('Mean Squared Error:', mse, '\n')
Mean Squared Error: 1.156954
cat('R-squared:', r_squared, '\n')
R-squared: 0.4713828
rmse <- function(actual, predicted) {
return(sqrt(mean((actual - predicted)^2)))
}
rmse_value <- rmse(test_set$cnt, predictions)
cat('Root Mean Squared Error:', rmse_value, '\n')
Root Mean Squared Error: 1.075618
day2 <- day1[ , -which(names(day1) == "cnt")]
pca <- prcomp(day2,scale=TRUE)
variance <- summary(pca)$importance[2,]
df <- data.frame(Component = 1:length(variance), Variance = variance)
df_top_10 <- df[1:10,]
df_top_10 <- df_top_10[order(-df_top_10$Variance),]
df_top_10
pc <- pca$rotation[, 1: 10]
z <- melt(pc)
colnames(z) <- c("Variable", "PC", "value")
plot <- ggplot(z, aes(x = PC, y = Variable, fill= value)) +
geom_tile(color = 'white')+
scale_fill_gradient2(low = 'brown', high = 'wheat', mid = 'ivory',
midpoint = 0, limit = c(-1,1), space = 'Lab')
plot
pca_data <- as.data.frame(predict(pca))
df2 <- pca_data[, 1:2]
df1 <- subset(day1, select = cnt)
combined_df <- cbind(df1, df2)
library(caret)
train_index <- createDataPartition(combined_df$cnt, p = 0.8, list = FALSE)
train_set <- combined_df[train_index,]
test_set <- combined_df[-train_index,]
# Train the multiple linear regression model
model2 <- lm(cnt ~ ., data = train_set)
summary(model2)
Call:
lm(formula = cnt ~ ., data = train_set)
Residuals:
Min 1Q Median 3Q Max
-3.9418 -0.4606 0.1181 0.6162 2.7952
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.537284 0.007966 569.55 <2e-16 ***
PC1 0.720840 0.005032 143.24 <2e-16 ***
PC2 0.113545 0.005553 20.45 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9394 on 13901 degrees of freedom
Multiple R-squared: 0.601, Adjusted R-squared: 0.601
F-statistic: 1.047e+04 on 2 and 13901 DF, p-value: < 2.2e-16
# Make predictions on the test set
predictions <- predict(model2, newdata = test_set)
# Evaluate the model
mae <- mean(abs(test_set$cnt - predictions))
mse <- mean((test_set$cnt - predictions)^2)
r_squared <- 1 - (sum((test_set$cnt - predictions)^2) / sum((test_set$cnt - mean(test_set$cnt))^2))
summary(predictions)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.916 3.689 4.405 4.555 5.292 8.573